Back to Repo


## Setup, Load Libraries & Report Versions
knitr::opts_chunk$set(echo = TRUE)

get_os <- function(){
  sysinf <- Sys.info()
  if (!is.null(sysinf)){
    os <- sysinf['sysname']
    if (os == 'Darwin')
      os <- "osx"
  } else { ## mystery machine
    os <- .Platform$OS.type
    if (grepl("^darwin", R.version$os))
      os <- "osx"
    if (grepl("linux-gnu", R.version$os))
      os <- "linux"
    if (grepl("mingw", R.version$os))
      os <- "windows"
  }
  tolower(os)
}

if (get_os()=="windows"){
  home_folder = "Z:~/viallr01/"
}else{
  home_folder = "~/"
}


## Load Libraries ------------------- 
## IMPORTANT!
## Please check the paths and files in the file "load_packages_and_functions.R" before running this!
## All .R files must be in the same folder, specified below in 'lib_folder'
lib_folder = paste0(home_folder,"ad-omics/ricardo/MyRepo/structuralvariation/R_Library/")
source(paste0(lib_folder,"load_packages_and_functions.R"))

Long read SV results

QC results for two ROSMAP samples sequenced using PacBio CLR (Sequel 2)

SM_CJEK6

p_load(magick)

results_path = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/SV_Analysis/00_QC/raw/")

ggdraw() + draw_image(paste0(results_path,"SM_CJEK6","/HistogramReadlength.png"), scale = 1)

stats = read.delim2(paste0(results_path,"SM_CJEK6","/NanoStats.txt"))
data.frame(Summary = gsub("(.*):(.*)","\\1",stats[1:7,]), Value = gsub("(.*):(.*)","\\2",stats[1:7,]) %>% trim.spaces())

SM_CJK3B

results_path = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/SV_Analysis/00_QC/raw/")

ggdraw() + draw_image(paste0(results_path,"SM_CJK3B","/HistogramReadlength.png"), scale = 1)

stats = read.delim2(paste0(results_path,"SM_CJK3B","/NanoStats.txt"))
data.frame(Summary = gsub("(.*):(.*)","\\1",stats[1:7,]), Value = gsub("(.*):(.*)","\\2",stats[1:7,]) %>% trim.spaces())

SV calls using SVIM (not filtered)

samples.id = c("SM_CJEK6","SM_CJK3B")
results_path = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/SV_Analysis/")

p_load(magick)

plot_sample <- list()
for (sampleID in samples.id){
  p1 <- ggdraw() + draw_image(paste0(results_path,"01_mapping/h37/",sampleID,".svim/sv-genotypes-q5.png"), scale = 1)
  p2 <- ggdraw() + draw_image(paste0(results_path,"01_mapping/h37/",sampleID,".svim/sv-lengths-q5.png"), scale = 1)
  
  title <- ggdraw() + 
    draw_label(
      sampleID,
      fontface = 'bold',
      x = 0,
      hjust = 0
    ) + theme(plot.margin = margin(0, 0, 0, 7))
  
  plot_row <- plot_grid(p1, p2)
  
  plot_sample[[sampleID]] <- plot_grid(title, plot_row,
          ncol = 1,
          rel_heights = c(0.1, 1))
}
plot_grid(plotlist = plot_sample, ncol = 1)

Benchmarking

Vapor

bench_dir = paste0(home_folder,"ad-omics/ricardo/ROSMAP_LongRead/ShortRead_results/final_gt/")
svtypes = c("DEL","DUP","INS","INV")

## INVs are mixed with complex SVs. We aggregate measures for INVs + different complex types
# ml R/3.6.2 bedtools bcftools picard samtools parallel
# samples=( "SM_CJK3B" "SM_CJEK6" )
# svtypes=( "DUP_INV" "DEL_INV" "DEL_DUP_INV" "DISDUP" )
# for sample in "${samples[@]}"; do
#   for svtype in "${svtypes[@]}"; do
#     sed "s/INV/${svtype}/g" ${sample}.gt.INV.filt.bed > ${sample}.gt.${svtype}.filt.bed
#     echo "rm -rf ${sample}_vapor_${svtype}; vapor bed --sv-input ${sample}.gt.${svtype}.filt.bed --output-path ${sample}_vapor_${svtype}/ --reference ~/ad-omics/ricardo/Data/bayestyper_GRCh37_bundle_v1.3/GRCh37_canon.fa --pacbio-input ~/ad-omics/ricardo/ROSMAP_LongRead/ShortRead_results/${sample}.mm2.sorted.bam" | bsub -n 1 -R 'span[hosts=1]' -R 'rusage[mem=4000]' -P acc_ad-omics -W 12:00 -oo ${sample}_vapor_${svtype}.out -eo ${sample}_vapor_${svtype}.err
#   done
# done

# SM_CJK3B
vapor_res = list()

for (svtype in svtypes){
  vapor_tmp0 = read.table(paste0(bench_dir,"SM_CJK3B.gt.",svtype,".filt.bed.vapor"), header = T) 
  if(svtype == "INV"){
    vapor_tmp_cpx = vapor_tmp0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")

    cpx_types = c("DUP_INV","DEL_INV","DEL_DUP_INV","DISDUP")
    
    for(cpx_type in cpx_types){
      vapor_tmp_cpx0 = read.table(paste0(bench_dir,"SM_CJK3B.gt.",cpx_type,".filt.bed.vapor"), header = T) 
      vapor_tmp_cpx1 = vapor_tmp_cpx0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")
      vapor_tmp_cpx$conf = vapor_tmp_cpx$conf | vapor_tmp_cpx1$conf
    }
    
  vapor_tmp_cpx$VaPoR_GS[vapor_tmp_cpx$conf] <- 1
  vapor_tmp_cpx$VaPoR_GT[vapor_tmp_cpx$conf] <- "0/1"
  vapor_res[[svtype]] = vapor_tmp_cpx
    
  }else{
    vapor_tmp0$conf = T
    vapor_res[[svtype]] = vapor_tmp0
  }
}

df_vapor = data.frame(Sample = "SM_CJK3B", SVTYPE = names(vapor_res), N = sapply(vapor_res, nrow))

df_conf_rate = Reduce(full_join, vapor_res) %>% drop_na() %>% group_by(SVTYPE) %>% dplyr::summarise(Sample = "SM_CJK3B", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n())

df_conf_rate = rbind(df_conf_rate , Reduce(full_join, vapor_res) %>% drop_na() %>% dplyr::summarise(Sample = "SM_CJK3B", SVTYPE = "ALL", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n()))

# SM_CJEK6

vapor_res = list()
for (svtype in svtypes){
  vapor_tmp0 = read.table(paste0(bench_dir,"SM_CJEK6.gt.",svtype,".filt.bed.vapor"), header = T) 
  if(svtype == "INV"){
    vapor_tmp_cpx = vapor_tmp0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")
    cpx_types = c("DUP_INV","DEL_INV","DEL_DUP_INV","DISDUP")
    for(cpx_type in cpx_types){
      vapor_tmp_cpx0 = read.table(paste0(bench_dir,"SM_CJEK6.gt.",cpx_type,".filt.bed.vapor"), header = T) 
      vapor_tmp_cpx1 = vapor_tmp_cpx0 %>% mutate(conf = VaPoR_GS>0 | VaPoR_GT!="0/0")
      vapor_tmp_cpx$conf = vapor_tmp_cpx$conf | vapor_tmp_cpx1$conf
    }
    
  vapor_tmp_cpx$VaPoR_GS[vapor_tmp_cpx$conf] <- 1
  vapor_tmp_cpx$VaPoR_GT[vapor_tmp_cpx$conf] <- "0/1"
  vapor_res[[svtype]] = vapor_tmp_cpx
    
  }else{
    vapor_tmp0$conf = T
    vapor_res[[svtype]] = vapor_tmp0
  }
}

df_vapor = rbind(df_vapor,data.frame(Sample = "SM_CJEK6", SVTYPE = names(vapor_res), N = sapply(vapor_res, nrow)))

df_conf_rate = rbind(df_conf_rate, Reduce(full_join, vapor_res) %>% drop_na() %>% group_by(SVTYPE) %>% dplyr::summarise(Sample = "SM_CJEK6", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n()))

df_conf_rate = rbind(df_conf_rate, Reduce(full_join, vapor_res) %>% drop_na() %>% dplyr::summarise(Sample = "SM_CJEK6", SVTYPE = "ALL", avg = (sum(VaPoR_GS>0 | VaPoR_GT!="0/0")/length(VaPoR_GS)), n = n()))

# Prepare plot
df_conf_rate$SVTYPE = as.character(df_conf_rate$SVTYPE)
df_conf_rate$SVTYPE[df_conf_rate$SVTYPE=="TANDUP"] <- "DUP"
df_conf_rate$SVTYPE = factor(df_conf_rate$SVTYPE, level = c("ALL","DEL","DUP","INS","INV"))
df_conf_rate_avg = df_conf_rate %>% group_by(SVTYPE) %>% dplyr::summarise(avg = median(avg))

c.1 <- ggplot(df_conf_rate[df_conf_rate$SVTYPE!="ALL",], aes(y = Sample, x = n, fill = SVTYPE)) +
  geom_bar(stat = "identity", position = "stack", color = "black", width = .7) +
  scale_fill_npg() + 
  theme_tufte(base_family = "Helvetica") +
  scale_x_continuous(breaks = c(0, 500, 1000, 1500),
                     label = c("0", "500", "1k", "1.5k")) +
  labs(x = "Number of SVs evaluated", y = "Sample", fill = NULL) +
  easy_remove_legend()

c.2 <- ggplot(df_conf_rate, aes(y = avg, x = SVTYPE)) +
  geom_point(alpha = 0.6) +
  stat_summary(fun.y = median, 
               geom = "crossbar", 
               width = 0.5,
               color = "#CB454A") +
  geom_vline(xintercept= 1.5, colour = "gray") +
  geom_text(data = df_conf_rate_avg, aes(label = scales::percent(avg), y = avg+.1),size = 3.5) +
  theme_tufte(base_family = "Helvetica") +
  expand_limits(y = c(0.3,1.1)) +
  labs(y = "Confirmation Rate", x = "SV type")

ggarrange(c.1, c.2, heights = c(1,1.5), ncol = 1, nrow = 2)

Session info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] splines   stats4    parallel  grid      stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##   [1] magick_2.7.1                org.Hs.eg.db_3.12.0        
##   [3] GOSemSim_2.16.1             clusterProfiler_3.18.1     
##   [5] pipeR_0.6.1.3               FactoMineR_2.4             
##   [7] missMDA_1.18                RcppCNPy_0.2.10            
##   [9] psycho_0.6.1                cqn_1.36.0                 
##  [11] quantreg_5.83               SparseM_1.78               
##  [13] preprocessCore_1.52.1       nor1mix_1.3-0              
##  [15] mclust_5.4.7                gdata_2.18.0               
##  [17] zoo_1.8-8                   qqman_0.1.4                
##  [19] gridExtra_2.3               wesanderson_0.3.6          
##  [21] ggrepel_0.9.1               rlist_0.4.6.1              
##  [23] ComplexHeatmap_2.6.2        circlize_0.4.12            
##  [25] gridBase_0.4-7              patchwork_1.1.1            
##  [27] ggeasy_0.1.3                treemap_2.4-2              
##  [29] GWASdata_1.28.0             GWASTools_1.36.0           
##  [31] ggupset_0.3.0               boot_1.3-25                
##  [33] wiggleplotr_1.14.0          Gviz_1.34.1                
##  [35] edgeR_3.32.1                qvalue_2.22.0              
##  [37] ggthemes_4.2.4              factoextra_1.0.7           
##  [39] vroom_1.4.0                 forcats_0.5.1              
##  [41] purrr_0.3.4                 readr_1.4.0                
##  [43] tibble_3.0.6                tidyverse_1.3.0            
##  [45] compare_0.2-6               superheat_0.1.0            
##  [47] abind_1.4-5                 splitstackshape_1.4.8      
##  [49] vcfR_1.12.0                 ggExtra_0.9                
##  [51] ggalluvial_0.12.3           ggallin_0.1.1              
##  [53] SeqArray_1.30.0             SNPRelate_1.24.0           
##  [55] gdsfmt_1.26.1               hablar_0.3.0               
##  [57] viridis_0.5.1               viridisLite_0.3.0          
##  [59] pheatmap_1.0.12             ShortRead_1.48.0           
##  [61] GenomicAlignments_1.26.0    Rsamtools_2.6.0            
##  [63] ggstance_0.3.5              aod_1.3.1                  
##  [65] readxl_1.3.1                ggsignif_0.6.0             
##  [67] venn_1.10                   rtracklayer_1.50.0         
##  [69] VennDiagram_1.6.20          futile.logger_1.4.3        
##  [71] UpSetR_1.4.0                RColorBrewer_1.1-2         
##  [73] mdatools_0.11.3             robustbase_0.93-7          
##  [75] lmerTest_3.1-3              Kendall_2.2                
##  [77] Amelia_1.7.6                magrittr_2.0.1             
##  [79] flextable_0.6.3             gtools_3.8.2               
##  [81] ggfortify_0.4.11            Hmisc_4.5-0                
##  [83] Formula_1.2-4               skimr_2.1.3                
##  [85] ggbeeswarm_0.6.0            memisc_0.99.27.3           
##  [87] lattice_0.20-41             ggiraph_0.7.8              
##  [89] tables_0.9.6                DT_0.17                    
##  [91] kableExtra_1.3.1            knitr_1.31                 
##  [93] reshape2_1.4.4              ggsci_2.9                  
##  [95] ggpubr_0.4.0                cowplot_1.1.1              
##  [97] uuid_0.1-4                  gghalves_0.1.1             
##  [99] sitools_1.4                 OUTRIDER_1.8.0             
## [101] SummarizedExperiment_1.20.0 MatrixGenerics_1.2.1       
## [103] matrixStats_0.58.0          GenomicFeatures_1.42.2     
## [105] AnnotationDbi_1.52.0        rstatix_0.6.0              
## [107] qPLEXanalyzer_1.8.2         MSnbase_2.16.1             
## [109] ProtGenerics_1.22.0         mzR_2.24.1                 
## [111] Rcpp_1.0.6                  proDA_1.4.0                
## [113] GGally_2.1.1                ggridges_0.5.3             
## [115] arules_1.6-7                glmnet_4.1-1               
## [117] gam_1.20                    arm_1.11-2                 
## [119] lme4_1.1-26                 MASS_7.3-53                
## [121] bumphunter_1.32.0           locfit_1.5-9.4             
## [123] iterators_1.0.13            foreach_1.5.1              
## [125] snpStats_1.40.0             Matrix_1.2-18              
## [127] survival_3.2-7              TnT_1.12.0                 
## [129] regioneR_1.22.0             GenomicRanges_1.42.0       
## [131] GenomeInfoDb_1.26.4         variancePartition_1.20.0   
## [133] BiocParallel_1.24.1         limma_3.46.0               
## [135] janitor_2.1.0               Biostrings_2.58.0          
## [137] XVector_0.30.0              IRanges_2.24.1             
## [139] S4Vectors_0.28.1            multtest_2.46.0            
## [141] Biobase_2.50.0              BiocGenerics_0.36.0        
## [143] stargazer_5.2.2             pander_0.6.3               
## [145] stringr_1.4.0               dplyr_1.0.4                
## [147] tidyr_1.1.2                 data.table_1.13.6          
## [149] ggplotify_0.0.5             sjPlot_2.8.7               
## [151] pacman_0.5.1                proteus_0.2.14             
## [153] proteusSILAC_0.1.0          proteusTMT_0.1.1           
## [155] proteusLabelFree_0.1.6      ggbiplot_0.55              
## [157] scales_1.1.1                plyr_1.8.6                 
## [159] ggmosaic_0.3.3              autoplot_0.1               
## [161] broom_0.7.4                 ggplot2_3.3.3              
## [163] safejoin_0.1.0              seqUtils_0.0.0.9000        
## [165] bsselectR_0.1.0             devtools_2.3.2             
## [167] usethis_2.0.0              
## 
## loaded via a namespace (and not attached):
##   [1] genefilter_1.72.1        sjlabelled_1.1.7         remotes_2.2.0           
##   [4] vctrs_0.3.6              vsn_3.58.0               blob_1.2.1              
##   [7] withr_2.4.1              foreign_0.8-80           gdtools_0.2.3           
##  [10] registry_0.5-1           rvcheck_0.1.8            lifecycle_0.2.0         
##  [13] emmeans_1.5.5-1          cellranger_1.1.0         munsell_0.5.0           
##  [16] DESeq2_1.30.1            codetools_0.2-16         seriation_1.2-9         
##  [19] lmtest_0.9-38            flashClust_1.01-2        DO.db_2.9               
##  [22] annotate_1.68.0          fs_1.5.0                 fastmatch_1.1-0         
##  [25] impute_1.64.0            formula.tools_1.7.1      biovizBase_1.38.0       
##  [28] stringi_1.5.3            polyclip_1.10-0          sandwich_3.0-0          
##  [31] ggraph_2.0.5             cluster_2.1.0            ape_5.4-1               
##  [34] pkgconfig_2.0.3          prettyunits_1.1.1        lubridate_1.7.9.2       
##  [37] estimability_1.3         httr_1.4.2               igraph_1.2.6            
##  [40] progress_1.2.2           GetoptLong_1.0.5         graphlayouts_0.7.1      
##  [43] haven_2.3.1              snakecase_0.11.0         parameters_0.12.0       
##  [46] htmltools_0.5.1.1        miniUI_0.1.1.1           GWASExactHW_1.01        
##  [49] yaml_2.2.1               pillar_1.4.7             later_1.1.0.1           
##  [52] glue_1.4.2               DBI_1.1.1                affy_1.68.0             
##  [55] doRNG_1.8.2              gtable_0.3.0             pcaMethods_1.82.0       
##  [58] caTools_1.18.1           GlobalOptions_0.1.2      latticeExtra_0.6-29     
##  [61] fastmap_1.1.0            checkmate_2.0.0          promises_1.1.1          
##  [64] webshot_0.5.2            ggforce_0.3.2            hms_1.0.0               
##  [67] askpass_1.1              png_0.1-7                ncdf4_1.17              
##  [70] clue_0.3-58              lazyeval_0.2.2           crayon_1.4.0            
##  [73] heatmaply_1.2.1          reprex_1.0.0             MALDIquant_1.19.3       
##  [76] colorRamps_2.3           tidyselect_1.1.0         xfun_0.20               
##  [79] performance_0.7.0        VariantAnnotation_1.36.0 operator.tools_1.6.3    
##  [82] rappdirs_0.3.3           bit64_4.0.5              rngtools_1.5            
##  [85] lambda.r_1.2.4           modelr_0.1.8             affyio_1.60.0           
##  [88] jpeg_0.1-8.1             mzID_1.28.0              sjmisc_2.8.6            
##  [91] MatrixModels_0.4-1       leaps_3.1                permute_0.9-5           
##  [94] PRROC_1.3.1              htmlTable_2.1.0          pinfsc50_1.2.0          
##  [97] xtable_1.8-4             admisc_0.12              cachem_1.0.3            
## [100] DelayedArray_0.16.2      vegan_2.5-7              systemfonts_1.0.1       
## [103] mime_0.10                rjson_0.2.20             processx_3.4.5          
## [106] insight_0.13.1           numDeriv_2016.8-1.1      tools_4.0.3             
## [109] sjstats_0.18.1           cli_2.3.0                dichromat_2.0-0         
## [112] assertthat_0.2.1         officer_0.3.16           logistf_1.24            
## [115] fgsea_1.16.0             repr_1.1.3               DNAcopy_1.64.0          
## [118] BiocFileCache_1.14.0     mgcv_1.8-33              plotly_4.9.3            
## [121] tweenr_1.0.1             ensembldb_2.14.0         effectsize_0.4.4        
## [124] zlibbioc_1.36.0          zip_2.1.1                hwriter_1.3.2           
## [127] bayestestR_0.8.2         rio_0.5.16               shadowtext_0.0.7        
## [130] biomaRt_2.46.3           geneplotter_1.68.0       ps_1.5.0                
## [133] tidygraph_1.2.0          conquer_1.0.2            KernSmooth_2.23-17      
## [136] backports_1.2.1          quantsmooth_1.56.0       scatterplot3d_0.3-41    
## [139] farver_2.0.3             bit_4.0.4                gplots_3.1.1            
## [142] openxlsx_4.2.3           shiny_1.6.0              scatterpie_0.1.5        
## [145] DOSE_3.16.0              futile.options_1.0.1     dendextend_1.14.0       
## [148] downloader_0.4           rstudioapi_0.13          minqa_1.2.4             
## [151] AnnotationFilter_1.14.0  nlme_3.1-149             shape_1.4.5             
## [154] beeswarm_0.3.1           generics_0.1.0           colorspace_2.0-0        
## [157] base64enc_0.1-3          XML_3.99-0.5             pkgbuild_1.2.0          
## [160] dbplyr_2.1.0             calibrate_1.7.7          GenomeInfoDbData_1.2.4  
## [163] evaluate_0.14            memoise_2.0.0            coda_0.19-4             
## [166] doParallel_1.0.16        vipor_0.4.5              httpuv_1.5.5            
## [169] openssl_1.4.3            BiocManager_1.30.10      formatR_1.7             
## [172] pkgload_1.1.0            jsonlite_1.7.2           digest_0.6.27           
## [175] BBmisc_1.11              rprojroot_2.0.2          bitops_1.0-6            
## [178] RSQLite_2.2.4            rmarkdown_2.6            compiler_4.0.3          
## [181] nnet_7.3-14              statmod_1.4.35           carData_3.0-4           
## [184] testthat_3.0.1           gridGraphics_0.5-1       rlang_0.4.10            
## [187] nloptr_1.2.2.2           sessioninfo_1.1.1        rvest_0.3.6             
## [190] mvtnorm_1.1-1            htmlwidgets_1.5.3        labeling_0.4.2          
## [193] callr_3.5.1              Cairo_1.5-12.2           curl_4.3                
## [196] pbkrtest_0.5-0.1         highr_0.8                DEoptimR_1.0-8          
## [199] BSgenome_1.58.0          desc_1.2.0               ggeffects_1.0.2         
## [202] enrichplot_1.10.2        RCurl_1.98-1.3           mice_3.13.0             
## [205] ggdendro_0.1.22          GO.db_3.12.1             car_3.0-10              
## [208] ellipsis_0.3.1           xml2_1.3.2               reshape_0.8.8           
## [211] rpart_4.1-15             R6_2.5.0                 TSP_1.1-10